/*-
 * Copyright (c) 2013-2019, Mellanox Technologies, Ltd. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include "opt_rss.h"
#include "opt_ratelimit.h"

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/delay.h>
#include <dev/mlx5/driver.h>
#include <dev/mlx5/mlx5_ifc.h>
#include <dev/mlx5/mlx5_core/mlx5_core.h>

/*
 * Base distance, in jiffies, between two health polls; a random jitter of
 * less than HZ is added on top by get_next_poll_jiffies().
 */
#define MLX5_HEALTH_POLL_INTERVAL	(2 * HZ)
/*
 * NOTE(review): presumably the number of consecutive polls with an unchanged
 * health counter that is tolerated; the user is outside this part of the
 * file - confirm against poll_health().
 */
#define MAX_MISSES			3

/*
 * Bit numbers in mlx5_core_health.flags.  While a bit is set, the matching
 * kind of work is refused instead of being queued.
 */
enum {
	MLX5_DROP_NEW_HEALTH_WORK,
	MLX5_DROP_NEW_RECOVERY_WORK,
	MLX5_DROP_NEW_WATCHDOG_WORK,
};
49ca551594SHans Petter Selasky 501900b6f8SHans Petter Selasky enum { 511900b6f8SHans Petter Selasky MLX5_SENSOR_NO_ERR = 0, 521900b6f8SHans Petter Selasky MLX5_SENSOR_PCI_COMM_ERR = 1, 531900b6f8SHans Petter Selasky MLX5_SENSOR_PCI_ERR = 2, 541900b6f8SHans Petter Selasky MLX5_SENSOR_NIC_DISABLED = 3, 551900b6f8SHans Petter Selasky MLX5_SENSOR_NIC_SW_RESET = 4, 56fe242ba7SHans Petter Selasky MLX5_SENSOR_FW_SYND_RFR = 5, 571900b6f8SHans Petter Selasky }; 581900b6f8SHans Petter Selasky 5929e54451SSlava Shwartsman static int mlx5_fw_reset_enable = 1; 6029e54451SSlava Shwartsman SYSCTL_INT(_hw_mlx5, OID_AUTO, fw_reset_enable, CTLFLAG_RWTUN, 6129e54451SSlava Shwartsman &mlx5_fw_reset_enable, 0, 6229e54451SSlava Shwartsman "Enable firmware reset"); 6329e54451SSlava Shwartsman 645169fb81SHans Petter Selasky static unsigned int sw_reset_to = 1200; 655169fb81SHans Petter Selasky SYSCTL_UINT(_hw_mlx5, OID_AUTO, sw_reset_timeout, CTLFLAG_RWTUN, 665169fb81SHans Petter Selasky &sw_reset_to, 0, 675169fb81SHans Petter Selasky "Minimum timeout in seconds between two firmware resets"); 685169fb81SHans Petter Selasky 695169fb81SHans Petter Selasky 70b575d8c8SHans Petter Selasky static int lock_sem_sw_reset(struct mlx5_core_dev *dev) 71f20b553dSHans Petter Selasky { 72b575d8c8SHans Petter Selasky int ret; 73f20b553dSHans Petter Selasky 74f20b553dSHans Petter Selasky /* Lock GW access */ 75b575d8c8SHans Petter Selasky ret = -mlx5_vsc_lock(dev); 76f20b553dSHans Petter Selasky if (ret) { 77b575d8c8SHans Petter Selasky mlx5_core_warn(dev, "Timed out locking gateway %d\n", ret); 78f20b553dSHans Petter Selasky return ret; 79f20b553dSHans Petter Selasky } 80f20b553dSHans Petter Selasky 81b575d8c8SHans Petter Selasky ret = -mlx5_vsc_lock_addr_space(dev, MLX5_SEMAPHORE_SW_RESET); 82b575d8c8SHans Petter Selasky if (ret) { 83f20b553dSHans Petter Selasky if (ret == -EBUSY) 84a2f4f59cSHans Petter Selasky mlx5_core_dbg(dev, 85a2f4f59cSHans Petter Selasky "SW reset FW semaphore already locked, 
another function will handle the reset\n"); 86f20b553dSHans Petter Selasky else 87a2f4f59cSHans Petter Selasky mlx5_core_warn(dev, 88a2f4f59cSHans Petter Selasky "SW reset semaphore lock return %d\n", ret); 89f20b553dSHans Petter Selasky } 90f20b553dSHans Petter Selasky 91f20b553dSHans Petter Selasky /* Unlock GW access */ 92b575d8c8SHans Petter Selasky mlx5_vsc_unlock(dev); 93b575d8c8SHans Petter Selasky 94b575d8c8SHans Petter Selasky return ret; 95b575d8c8SHans Petter Selasky } 96b575d8c8SHans Petter Selasky 97b575d8c8SHans Petter Selasky static int unlock_sem_sw_reset(struct mlx5_core_dev *dev) 98b575d8c8SHans Petter Selasky { 99b575d8c8SHans Petter Selasky int ret; 100b575d8c8SHans Petter Selasky 101b575d8c8SHans Petter Selasky /* Lock GW access */ 102b575d8c8SHans Petter Selasky ret = -mlx5_vsc_lock(dev); 103b575d8c8SHans Petter Selasky if (ret) { 104b575d8c8SHans Petter Selasky mlx5_core_warn(dev, "Timed out locking gateway %d\n", ret); 105b575d8c8SHans Petter Selasky return ret; 106b575d8c8SHans Petter Selasky } 107b575d8c8SHans Petter Selasky 108b575d8c8SHans Petter Selasky ret = -mlx5_vsc_unlock_addr_space(dev, MLX5_SEMAPHORE_SW_RESET); 109b575d8c8SHans Petter Selasky 110b575d8c8SHans Petter Selasky /* Unlock GW access */ 111b575d8c8SHans Petter Selasky mlx5_vsc_unlock(dev); 112f20b553dSHans Petter Selasky 113f20b553dSHans Petter Selasky return ret; 114f20b553dSHans Petter Selasky } 115f20b553dSHans Petter Selasky 116ba11bcecSHans Petter Selasky u8 mlx5_get_nic_state(struct mlx5_core_dev *dev) 117a2485fe5SHans Petter Selasky { 1181900b6f8SHans Petter Selasky return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7; 119a2485fe5SHans Petter Selasky } 120a2485fe5SHans Petter Selasky 121ba11bcecSHans Petter Selasky void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state) 122ba11bcecSHans Petter Selasky { 123ba11bcecSHans Petter Selasky u32 cur_cmdq_addr_l_sz; 124ba11bcecSHans Petter Selasky 125ba11bcecSHans Petter Selasky cur_cmdq_addr_l_sz = 
ioread32be(&dev->iseg->cmdq_addr_l_sz); 126ba11bcecSHans Petter Selasky iowrite32be((cur_cmdq_addr_l_sz & 0xFFFFF000) | 127ba11bcecSHans Petter Selasky state << MLX5_NIC_IFC_OFFSET, 128ba11bcecSHans Petter Selasky &dev->iseg->cmdq_addr_l_sz); 129ba11bcecSHans Petter Selasky } 130ba11bcecSHans Petter Selasky 131fe242ba7SHans Petter Selasky static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev) 132fe242ba7SHans Petter Selasky { 133fe242ba7SHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 134fe242ba7SHans Petter Selasky struct mlx5_health_buffer __iomem *h = health->health; 135fe242ba7SHans Petter Selasky u32 rfr = ioread32be(&h->rfr) >> MLX5_RFR_OFFSET; 136fe242ba7SHans Petter Selasky u8 synd = ioread8(&h->synd); 137fe242ba7SHans Petter Selasky 138fe242ba7SHans Petter Selasky if (rfr && synd) 139fe242ba7SHans Petter Selasky mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd); 140fe242ba7SHans Petter Selasky return rfr && synd; 141fe242ba7SHans Petter Selasky } 142fe242ba7SHans Petter Selasky 143a0a4fd77SHans Petter Selasky static void mlx5_trigger_cmd_completions(struct work_struct *work) 144a2485fe5SHans Petter Selasky { 145a0a4fd77SHans Petter Selasky struct mlx5_core_dev *dev = 146a0a4fd77SHans Petter Selasky container_of(work, struct mlx5_core_dev, priv.health.work_cmd_completion); 147a2485fe5SHans Petter Selasky unsigned long flags; 148a2485fe5SHans Petter Selasky u64 vector; 149a2485fe5SHans Petter Selasky 150a2485fe5SHans Petter Selasky /* wait for pending handlers to complete */ 151a2485fe5SHans Petter Selasky synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector); 152a2485fe5SHans Petter Selasky spin_lock_irqsave(&dev->cmd.alloc_lock, flags); 153a2485fe5SHans Petter Selasky vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1); 154a2485fe5SHans Petter Selasky if (!vector) 155a2485fe5SHans Petter Selasky goto no_trig; 156a2485fe5SHans Petter Selasky 157a2485fe5SHans Petter Selasky vector |= 
MLX5_TRIGGERED_CMD_COMP; 158a2485fe5SHans Petter Selasky spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); 159a2485fe5SHans Petter Selasky 1602cec1528SKonstantin Belousov mlx5_core_dbg(dev, "vector 0x%jx\n", (uintmax_t)vector); 161721a1a6aSSlava Shwartsman mlx5_cmd_comp_handler(dev, vector, MLX5_CMD_MODE_EVENTS); 162a2485fe5SHans Petter Selasky return; 163a2485fe5SHans Petter Selasky 164a2485fe5SHans Petter Selasky no_trig: 165a2485fe5SHans Petter Selasky spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); 166a2485fe5SHans Petter Selasky } 167a2485fe5SHans Petter Selasky 1681900b6f8SHans Petter Selasky static bool sensor_pci_no_comm(struct mlx5_core_dev *dev) 169a2485fe5SHans Petter Selasky { 170a2485fe5SHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 171a2485fe5SHans Petter Selasky struct mlx5_health_buffer __iomem *h = health->health; 1721900b6f8SHans Petter Selasky bool err = ioread32be(&h->fw_ver) == 0xffffffff; 173a2485fe5SHans Petter Selasky 1741900b6f8SHans Petter Selasky return err; 1751900b6f8SHans Petter Selasky } 176a2485fe5SHans Petter Selasky 1771900b6f8SHans Petter Selasky static bool sensor_nic_disabled(struct mlx5_core_dev *dev) 1781900b6f8SHans Petter Selasky { 179ba11bcecSHans Petter Selasky return mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED; 1801900b6f8SHans Petter Selasky } 181a2485fe5SHans Petter Selasky 1821900b6f8SHans Petter Selasky static bool sensor_nic_sw_reset(struct mlx5_core_dev *dev) 1831900b6f8SHans Petter Selasky { 184ba11bcecSHans Petter Selasky return mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET; 1851900b6f8SHans Petter Selasky } 1861900b6f8SHans Petter Selasky 1871900b6f8SHans Petter Selasky static u32 check_fatal_sensors(struct mlx5_core_dev *dev) 1881900b6f8SHans Petter Selasky { 1891900b6f8SHans Petter Selasky if (sensor_pci_no_comm(dev)) 1901900b6f8SHans Petter Selasky return MLX5_SENSOR_PCI_COMM_ERR; 1911900b6f8SHans Petter Selasky if (pci_channel_offline(dev->pdev)) 1921900b6f8SHans 
Petter Selasky return MLX5_SENSOR_PCI_ERR; 1931900b6f8SHans Petter Selasky if (sensor_nic_disabled(dev)) 1941900b6f8SHans Petter Selasky return MLX5_SENSOR_NIC_DISABLED; 1951900b6f8SHans Petter Selasky if (sensor_nic_sw_reset(dev)) 1961900b6f8SHans Petter Selasky return MLX5_SENSOR_NIC_SW_RESET; 197fe242ba7SHans Petter Selasky if (sensor_fw_synd_rfr(dev)) 198fe242ba7SHans Petter Selasky return MLX5_SENSOR_FW_SYND_RFR; 1991900b6f8SHans Petter Selasky 2001900b6f8SHans Petter Selasky return MLX5_SENSOR_NO_ERR; 201a2485fe5SHans Petter Selasky } 202a2485fe5SHans Petter Selasky 203fe242ba7SHans Petter Selasky static void reset_fw_if_needed(struct mlx5_core_dev *dev) 204fe242ba7SHans Petter Selasky { 20529e54451SSlava Shwartsman bool supported; 206fe242ba7SHans Petter Selasky u32 cmdq_addr, fatal_error; 207fe242ba7SHans Petter Selasky 20829e54451SSlava Shwartsman if (!mlx5_fw_reset_enable) 20929e54451SSlava Shwartsman return; 21029e54451SSlava Shwartsman supported = (ioread32be(&dev->iseg->initializing) >> 21129e54451SSlava Shwartsman MLX5_FW_RESET_SUPPORTED_OFFSET) & 1; 212fe242ba7SHans Petter Selasky if (!supported) 213fe242ba7SHans Petter Selasky return; 214fe242ba7SHans Petter Selasky 215fe242ba7SHans Petter Selasky /* The reset only needs to be issued by one PF. The health buffer is 216fe242ba7SHans Petter Selasky * shared between all functions, and will be cleared during a reset. 217fe242ba7SHans Petter Selasky * Check again to avoid a redundant 2nd reset. If the fatal erros was 218fe242ba7SHans Petter Selasky * PCI related a reset won't help. 
219fe242ba7SHans Petter Selasky */ 220fe242ba7SHans Petter Selasky fatal_error = check_fatal_sensors(dev); 221fe242ba7SHans Petter Selasky if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR || 222fe242ba7SHans Petter Selasky fatal_error == MLX5_SENSOR_NIC_DISABLED || 223d28b6b55SHans Petter Selasky fatal_error == MLX5_SENSOR_NIC_SW_RESET) { 224a2f4f59cSHans Petter Selasky mlx5_core_warn(dev, 225a2f4f59cSHans Petter Selasky "Not issuing FW reset. Either it's already done or won't help.\n"); 226fe242ba7SHans Petter Selasky return; 227fe242ba7SHans Petter Selasky } 228fe242ba7SHans Petter Selasky 229a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "Issuing FW Reset\n"); 230fe242ba7SHans Petter Selasky /* Write the NIC interface field to initiate the reset, the command 231fe242ba7SHans Petter Selasky * interface address also resides here, don't overwrite it. 232fe242ba7SHans Petter Selasky */ 233fe242ba7SHans Petter Selasky cmdq_addr = ioread32be(&dev->iseg->cmdq_addr_l_sz); 234fe242ba7SHans Petter Selasky iowrite32be((cmdq_addr & 0xFFFFF000) | 235fe242ba7SHans Petter Selasky MLX5_NIC_IFC_SW_RESET << MLX5_NIC_IFC_OFFSET, 236fe242ba7SHans Petter Selasky &dev->iseg->cmdq_addr_l_sz); 237fe242ba7SHans Petter Selasky } 238fe242ba7SHans Petter Selasky 2395169fb81SHans Petter Selasky static bool 2405169fb81SHans Petter Selasky mlx5_health_allow_reset(struct mlx5_core_dev *dev) 2415169fb81SHans Petter Selasky { 2425169fb81SHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 2435169fb81SHans Petter Selasky unsigned int delta; 2445169fb81SHans Petter Selasky bool ret; 2455169fb81SHans Petter Selasky 2465169fb81SHans Petter Selasky if (health->last_reset_req != 0) { 2475169fb81SHans Petter Selasky delta = ticks - health->last_reset_req; 2485169fb81SHans Petter Selasky delta /= hz; 2495169fb81SHans Petter Selasky ret = delta >= sw_reset_to; 2505169fb81SHans Petter Selasky } else { 2515169fb81SHans Petter Selasky ret = true; 2525169fb81SHans Petter Selasky } 
2535169fb81SHans Petter Selasky 2545169fb81SHans Petter Selasky /* 2555169fb81SHans Petter Selasky * In principle, ticks may be 0. Setting it to off by one (-1) 2565169fb81SHans Petter Selasky * to prevent certain reset in next request. 2575169fb81SHans Petter Selasky */ 2585169fb81SHans Petter Selasky health->last_reset_req = ticks ? : -1; 2595169fb81SHans Petter Selasky if (!ret) 260a2f4f59cSHans Petter Selasky mlx5_core_warn(dev, 261a2f4f59cSHans Petter Selasky "Firmware reset elided due to auto-reset frequency threshold.\n"); 2625169fb81SHans Petter Selasky return (ret); 2635169fb81SHans Petter Selasky } 2645169fb81SHans Petter Selasky 265d28b6b55SHans Petter Selasky #define MLX5_CRDUMP_WAIT_MS 60000 266d28b6b55SHans Petter Selasky #define MLX5_FW_RESET_WAIT_MS 1000 267d28b6b55SHans Petter Selasky #define MLX5_NIC_STATE_POLL_MS 5 268c0902569SHans Petter Selasky void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) 269a2485fe5SHans Petter Selasky { 270c2a1e807SHans Petter Selasky int end, delay_ms = MLX5_CRDUMP_WAIT_MS; 271d28b6b55SHans Petter Selasky u32 fatal_error; 272d28b6b55SHans Petter Selasky int lock = -EBUSY; 273d28b6b55SHans Petter Selasky 274d28b6b55SHans Petter Selasky fatal_error = check_fatal_sensors(dev); 275d28b6b55SHans Petter Selasky 276d28b6b55SHans Petter Selasky if (fatal_error || force) { 2776d54b22dSHans Petter Selasky if (xchg(&dev->state, MLX5_DEVICE_STATE_INTERNAL_ERROR) == 2786d54b22dSHans Petter Selasky MLX5_DEVICE_STATE_INTERNAL_ERROR) 2796d54b22dSHans Petter Selasky return; 2801fb6089cSHans Petter Selasky if (!force) 2811fb6089cSHans Petter Selasky mlx5_core_err(dev, "internal state error detected\n"); 282a0a4fd77SHans Petter Selasky 283a0a4fd77SHans Petter Selasky /* 284a0a4fd77SHans Petter Selasky * Queue the command completion handler on the command 285a0a4fd77SHans Petter Selasky * work queue to avoid racing with the real command 286a0a4fd77SHans Petter Selasky * completion handler and then wait for it to 
287a0a4fd77SHans Petter Selasky * complete: 288a0a4fd77SHans Petter Selasky */ 2898d1eeedbSHans Petter Selasky queue_work(dev->priv.health.wq_cmd, &dev->priv.health.work_cmd_completion); 2908d1eeedbSHans Petter Selasky flush_workqueue(dev->priv.health.wq_cmd); 2917053deebSHans Petter Selasky } 292a2485fe5SHans Petter Selasky 2936d54b22dSHans Petter Selasky mutex_lock(&dev->intf_state_mutex); 2946d54b22dSHans Petter Selasky 295d28b6b55SHans Petter Selasky if (force) 296d28b6b55SHans Petter Selasky goto err_state_done; 297d28b6b55SHans Petter Selasky 2985169fb81SHans Petter Selasky if (fatal_error == MLX5_SENSOR_FW_SYND_RFR && 2995169fb81SHans Petter Selasky mlx5_health_allow_reset(dev)) { 30092d23c82SHans Petter Selasky /* Get cr-dump and reset FW semaphore */ 301d28b6b55SHans Petter Selasky if (mlx5_core_is_pf(dev)) 302b575d8c8SHans Petter Selasky lock = lock_sem_sw_reset(dev); 303d28b6b55SHans Petter Selasky 30492d23c82SHans Petter Selasky /* Execute cr-dump and SW reset */ 305d28b6b55SHans Petter Selasky if (lock != -EBUSY) { 3063e40712eSHans Petter Selasky (void)mlx5_fwdump(dev); 307d28b6b55SHans Petter Selasky reset_fw_if_needed(dev); 308d28b6b55SHans Petter Selasky delay_ms = MLX5_FW_RESET_WAIT_MS; 309d28b6b55SHans Petter Selasky } 310d28b6b55SHans Petter Selasky } 311d28b6b55SHans Petter Selasky 312d28b6b55SHans Petter Selasky /* Recover from SW reset */ 313d28b6b55SHans Petter Selasky end = jiffies + msecs_to_jiffies(delay_ms); 314d28b6b55SHans Petter Selasky do { 315d28b6b55SHans Petter Selasky if (sensor_nic_disabled(dev)) 316d28b6b55SHans Petter Selasky break; 317d28b6b55SHans Petter Selasky 318d28b6b55SHans Petter Selasky msleep(MLX5_NIC_STATE_POLL_MS); 319d28b6b55SHans Petter Selasky } while (!time_after(jiffies, end)); 320d28b6b55SHans Petter Selasky 321d28b6b55SHans Petter Selasky if (!sensor_nic_disabled(dev)) { 322a2f4f59cSHans Petter Selasky mlx5_core_err(dev, "NIC IFC still %d after %ums.\n", 323ba11bcecSHans Petter Selasky 
mlx5_get_nic_state(dev), delay_ms); 324d28b6b55SHans Petter Selasky } 325d28b6b55SHans Petter Selasky 326d28b6b55SHans Petter Selasky /* Release FW semaphore if you are the lock owner */ 327d28b6b55SHans Petter Selasky if (!lock) 328b575d8c8SHans Petter Selasky unlock_sem_sw_reset(dev); 329d28b6b55SHans Petter Selasky 330a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "System error event triggered\n"); 3317053deebSHans Petter Selasky 332d28b6b55SHans Petter Selasky err_state_done: 333843a89d3SSlava Shwartsman mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 1); 3347053deebSHans Petter Selasky mutex_unlock(&dev->intf_state_mutex); 335a2485fe5SHans Petter Selasky } 336a2485fe5SHans Petter Selasky 337a2485fe5SHans Petter Selasky static void mlx5_handle_bad_state(struct mlx5_core_dev *dev) 338a2485fe5SHans Petter Selasky { 339ba11bcecSHans Petter Selasky u8 nic_mode = mlx5_get_nic_state(dev); 340a2485fe5SHans Petter Selasky 3411900b6f8SHans Petter Selasky if (nic_mode == MLX5_NIC_IFC_SW_RESET) { 3421900b6f8SHans Petter Selasky /* The IFC mode field is 3 bits, so it will read 0x7 in two cases: 3431900b6f8SHans Petter Selasky * 1. PCI has been disabled (ie. PCI-AER, PF driver unloaded 3441900b6f8SHans Petter Selasky * and this is a VF), this is not recoverable by SW reset. 3451900b6f8SHans Petter Selasky * Logging of this is handled elsewhere. 3461900b6f8SHans Petter Selasky * 2. FW reset has been issued by another function, driver can 3471900b6f8SHans Petter Selasky * be reloaded to recover after the mode switches to 3481900b6f8SHans Petter Selasky * MLX5_NIC_IFC_DISABLED. 
3491900b6f8SHans Petter Selasky */ 3501900b6f8SHans Petter Selasky if (dev->priv.health.fatal_error != MLX5_SENSOR_PCI_COMM_ERR) 351a2f4f59cSHans Petter Selasky mlx5_core_warn(dev, 352a2f4f59cSHans Petter Selasky "NIC SW reset is already progress\n"); 3531900b6f8SHans Petter Selasky else 354a2f4f59cSHans Petter Selasky mlx5_core_warn(dev, 355a2f4f59cSHans Petter Selasky "Communication with FW over the PCI link is down\n"); 3561900b6f8SHans Petter Selasky } else { 3571900b6f8SHans Petter Selasky mlx5_core_warn(dev, "NIC mode %d\n", nic_mode); 358a2485fe5SHans Petter Selasky } 359a2485fe5SHans Petter Selasky 360a2485fe5SHans Petter Selasky mlx5_disable_device(dev); 361a2485fe5SHans Petter Selasky } 362dc7e38acSHans Petter Selasky 3631900b6f8SHans Petter Selasky #define MLX5_FW_RESET_WAIT_MS 1000 3641900b6f8SHans Petter Selasky #define MLX5_NIC_STATE_POLL_MS 5 3654bb7662bSHans Petter Selasky static void health_recover(struct work_struct *work) 3664bb7662bSHans Petter Selasky { 3671900b6f8SHans Petter Selasky unsigned long end = jiffies + msecs_to_jiffies(MLX5_FW_RESET_WAIT_MS); 3684bb7662bSHans Petter Selasky struct mlx5_core_health *health; 3694bb7662bSHans Petter Selasky struct delayed_work *dwork; 3704bb7662bSHans Petter Selasky struct mlx5_core_dev *dev; 3714bb7662bSHans Petter Selasky struct mlx5_priv *priv; 372f20b553dSHans Petter Selasky bool recover = true; 3731900b6f8SHans Petter Selasky u8 nic_mode; 3744bb7662bSHans Petter Selasky 3754bb7662bSHans Petter Selasky dwork = container_of(work, struct delayed_work, work); 3764bb7662bSHans Petter Selasky health = container_of(dwork, struct mlx5_core_health, recover_work); 3774bb7662bSHans Petter Selasky priv = container_of(health, struct mlx5_priv, health); 3784bb7662bSHans Petter Selasky dev = container_of(priv, struct mlx5_core_dev, priv); 3794bb7662bSHans Petter Selasky 380c6df6f53SWarner Losh /* This might likely be wrong, cut and paste from elsewhere? 
*/ 381c6df6f53SWarner Losh bus_topo_lock(); 382ca2345a0SHans Petter Selasky 3831900b6f8SHans Petter Selasky if (sensor_pci_no_comm(dev)) { 384a2f4f59cSHans Petter Selasky mlx5_core_err(dev, 385a2f4f59cSHans Petter Selasky "health recovery flow aborted, PCI reads still not working\n"); 386f20b553dSHans Petter Selasky recover = false; 3871900b6f8SHans Petter Selasky } 3881900b6f8SHans Petter Selasky 389ba11bcecSHans Petter Selasky nic_mode = mlx5_get_nic_state(dev); 3901900b6f8SHans Petter Selasky while (nic_mode != MLX5_NIC_IFC_DISABLED && 3911900b6f8SHans Petter Selasky !time_after(jiffies, end)) { 3921900b6f8SHans Petter Selasky msleep(MLX5_NIC_STATE_POLL_MS); 393ba11bcecSHans Petter Selasky nic_mode = mlx5_get_nic_state(dev); 3941900b6f8SHans Petter Selasky } 3951900b6f8SHans Petter Selasky 3961900b6f8SHans Petter Selasky if (nic_mode != MLX5_NIC_IFC_DISABLED) { 397a2f4f59cSHans Petter Selasky mlx5_core_err(dev, 398a2f4f59cSHans Petter Selasky "health recovery flow aborted, unexpected NIC IFC mode %d.\n", 3991900b6f8SHans Petter Selasky nic_mode); 400f20b553dSHans Petter Selasky recover = false; 4014bb7662bSHans Petter Selasky } 4024bb7662bSHans Petter Selasky 403f20b553dSHans Petter Selasky if (recover) { 404a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "Starting health recovery flow\n"); 4054bb7662bSHans Petter Selasky mlx5_recover_device(dev); 4064bb7662bSHans Petter Selasky } 407ca2345a0SHans Petter Selasky 408c6df6f53SWarner Losh bus_topo_unlock(); 409f20b553dSHans Petter Selasky } 4104bb7662bSHans Petter Selasky 4114bb7662bSHans Petter Selasky /* How much time to wait until health resetting the driver (in msecs) */ 4124bb7662bSHans Petter Selasky #define MLX5_RECOVERY_DELAY_MSECS 60000 4131900b6f8SHans Petter Selasky #define MLX5_RECOVERY_NO_DELAY 0 4141900b6f8SHans Petter Selasky static unsigned long get_recovery_delay(struct mlx5_core_dev *dev) 4151900b6f8SHans Petter Selasky { 4161900b6f8SHans Petter Selasky return dev->priv.health.fatal_error == 
MLX5_SENSOR_PCI_ERR || 4171900b6f8SHans Petter Selasky dev->priv.health.fatal_error == MLX5_SENSOR_PCI_COMM_ERR ? 4181900b6f8SHans Petter Selasky MLX5_RECOVERY_DELAY_MSECS : MLX5_RECOVERY_NO_DELAY; 4191900b6f8SHans Petter Selasky } 4201900b6f8SHans Petter Selasky 421dc7e38acSHans Petter Selasky static void health_care(struct work_struct *work) 422dc7e38acSHans Petter Selasky { 423a2485fe5SHans Petter Selasky struct mlx5_core_health *health; 4241900b6f8SHans Petter Selasky unsigned long recover_delay; 425dc7e38acSHans Petter Selasky struct mlx5_core_dev *dev; 426dc7e38acSHans Petter Selasky struct mlx5_priv *priv; 4274bb7662bSHans Petter Selasky unsigned long flags; 428dc7e38acSHans Petter Selasky 429a2485fe5SHans Petter Selasky health = container_of(work, struct mlx5_core_health, work); 430dc7e38acSHans Petter Selasky priv = container_of(health, struct mlx5_priv, health); 431dc7e38acSHans Petter Selasky dev = container_of(priv, struct mlx5_core_dev, priv); 432f20b553dSHans Petter Selasky 433dc7e38acSHans Petter Selasky mlx5_core_warn(dev, "handling bad device here\n"); 434a2485fe5SHans Petter Selasky mlx5_handle_bad_state(dev); 4351900b6f8SHans Petter Selasky recover_delay = msecs_to_jiffies(get_recovery_delay(dev)); 4364bb7662bSHans Petter Selasky 4374bb7662bSHans Petter Selasky spin_lock_irqsave(&health->wq_lock, flags); 438fe242ba7SHans Petter Selasky if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags)) { 439a2f4f59cSHans Petter Selasky mlx5_core_warn(dev, 440a2f4f59cSHans Petter Selasky "Scheduling recovery work with %lums delay\n", 441fe242ba7SHans Petter Selasky recover_delay); 4424bb7662bSHans Petter Selasky schedule_delayed_work(&health->recover_work, recover_delay); 443fe242ba7SHans Petter Selasky } else { 444a2f4f59cSHans Petter Selasky mlx5_core_err(dev, 4454bb7662bSHans Petter Selasky "new health works are not permitted at this stage\n"); 446fe242ba7SHans Petter Selasky } 4474bb7662bSHans Petter Selasky spin_unlock_irqrestore(&health->wq_lock, 
flags); 448dc7e38acSHans Petter Selasky } 449a2485fe5SHans Petter Selasky 450a2485fe5SHans Petter Selasky static int get_next_poll_jiffies(void) 451a2485fe5SHans Petter Selasky { 452a2485fe5SHans Petter Selasky unsigned long next; 453a2485fe5SHans Petter Selasky 454a2485fe5SHans Petter Selasky get_random_bytes(&next, sizeof(next)); 455a2485fe5SHans Petter Selasky next %= HZ; 456a2485fe5SHans Petter Selasky next += jiffies + MLX5_HEALTH_POLL_INTERVAL; 457a2485fe5SHans Petter Selasky 458a2485fe5SHans Petter Selasky return next; 459dc7e38acSHans Petter Selasky } 460dc7e38acSHans Petter Selasky 4614bb7662bSHans Petter Selasky void mlx5_trigger_health_work(struct mlx5_core_dev *dev) 4624bb7662bSHans Petter Selasky { 4634bb7662bSHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 4644bb7662bSHans Petter Selasky unsigned long flags; 4654bb7662bSHans Petter Selasky 4664bb7662bSHans Petter Selasky spin_lock_irqsave(&health->wq_lock, flags); 4674bb7662bSHans Petter Selasky if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags)) 4684bb7662bSHans Petter Selasky queue_work(health->wq, &health->work); 4694bb7662bSHans Petter Selasky else 470a2f4f59cSHans Petter Selasky mlx5_core_err(dev, 4714bb7662bSHans Petter Selasky "new health works are not permitted at this stage\n"); 4724bb7662bSHans Petter Selasky spin_unlock_irqrestore(&health->wq_lock, flags); 4734bb7662bSHans Petter Selasky } 4744bb7662bSHans Petter Selasky 475dc7e38acSHans Petter Selasky static const char *hsynd_str(u8 synd) 476dc7e38acSHans Petter Selasky { 477dc7e38acSHans Petter Selasky switch (synd) { 478dc7e38acSHans Petter Selasky case MLX5_HEALTH_SYNDR_FW_ERR: 479dc7e38acSHans Petter Selasky return "firmware internal error"; 480dc7e38acSHans Petter Selasky case MLX5_HEALTH_SYNDR_IRISC_ERR: 481dc7e38acSHans Petter Selasky return "irisc not responding"; 482a2485fe5SHans Petter Selasky case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR: 483a2485fe5SHans Petter Selasky return "unrecoverable hardware 
error"; 484dc7e38acSHans Petter Selasky case MLX5_HEALTH_SYNDR_CRC_ERR: 485dc7e38acSHans Petter Selasky return "firmware CRC error"; 486dc7e38acSHans Petter Selasky case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR: 487dc7e38acSHans Petter Selasky return "ICM fetch PCI error"; 488dc7e38acSHans Petter Selasky case MLX5_HEALTH_SYNDR_HW_FTL_ERR: 489dc7e38acSHans Petter Selasky return "HW fatal error\n"; 490dc7e38acSHans Petter Selasky case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR: 491dc7e38acSHans Petter Selasky return "async EQ buffer overrun"; 492dc7e38acSHans Petter Selasky case MLX5_HEALTH_SYNDR_EQ_ERR: 493dc7e38acSHans Petter Selasky return "EQ error"; 494a2485fe5SHans Petter Selasky case MLX5_HEALTH_SYNDR_EQ_INV: 495a2485fe5SHans Petter Selasky return "Invalid EQ referenced"; 496dc7e38acSHans Petter Selasky case MLX5_HEALTH_SYNDR_FFSER_ERR: 497dc7e38acSHans Petter Selasky return "FFSER error"; 498a2485fe5SHans Petter Selasky case MLX5_HEALTH_SYNDR_HIGH_TEMP: 49987b3c8ccSHans Petter Selasky return "High temperature"; 500dc7e38acSHans Petter Selasky default: 501dc7e38acSHans Petter Selasky return "unrecognized error"; 502dc7e38acSHans Petter Selasky } 503dc7e38acSHans Petter Selasky } 504dc7e38acSHans Petter Selasky 505c9bb26aeSHans Petter Selasky static u8 506c9bb26aeSHans Petter Selasky print_health_info(struct mlx5_core_dev *dev) 507dc7e38acSHans Petter Selasky { 508dc7e38acSHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 509dc7e38acSHans Petter Selasky struct mlx5_health_buffer __iomem *h = health->health; 510c9bb26aeSHans Petter Selasky u8 synd = ioread8(&h->synd); 511a2485fe5SHans Petter Selasky char fw_str[18]; 512a2485fe5SHans Petter Selasky u32 fw; 513dc7e38acSHans Petter Selasky int i; 514dc7e38acSHans Petter Selasky 515c9bb26aeSHans Petter Selasky /* 516c9bb26aeSHans Petter Selasky * If synd is 0x0 - this indicates that FW is unable to 517c9bb26aeSHans Petter Selasky * respond to initialization segment reads and health buffer 518c9bb26aeSHans 
Petter Selasky * should not be read. 519c9bb26aeSHans Petter Selasky */ 520c9bb26aeSHans Petter Selasky if (synd == 0) 521c9bb26aeSHans Petter Selasky return (0); 522dc7e38acSHans Petter Selasky 523a2485fe5SHans Petter Selasky for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) 524a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "assert_var[%d] 0x%08x\n", i, 525a2f4f59cSHans Petter Selasky ioread32be(h->assert_var + i)); 526a2485fe5SHans Petter Selasky 527a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "assert_exit_ptr 0x%08x\n", 528a2f4f59cSHans Petter Selasky ioread32be(&h->assert_exit_ptr)); 529a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "assert_callra 0x%08x\n", 530a2f4f59cSHans Petter Selasky ioread32be(&h->assert_callra)); 531a2f4f59cSHans Petter Selasky snprintf(fw_str, sizeof(fw_str), "%d.%d.%d", 532a2f4f59cSHans Petter Selasky fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev)); 533a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "fw_ver %s\n", fw_str); 534a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id)); 535a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "irisc_index %d\n", ioread8(&h->irisc_index)); 536a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "synd 0x%x: %s\n", 537a2f4f59cSHans Petter Selasky ioread8(&h->synd), hsynd_str(ioread8(&h->synd))); 538a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd)); 539a2485fe5SHans Petter Selasky fw = ioread32be(&h->fw_ver); 540a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "raw fw_ver 0x%08x\n", fw); 541c9bb26aeSHans Petter Selasky 542c9bb26aeSHans Petter Selasky return synd; 543dc7e38acSHans Petter Selasky } 544dc7e38acSHans Petter Selasky 545adb6fd50SHans Petter Selasky static void health_watchdog(struct work_struct *work) 546adb6fd50SHans Petter Selasky { 547adb6fd50SHans Petter Selasky struct mlx5_core_dev *dev; 548adb6fd50SHans Petter Selasky u16 power; 549adb6fd50SHans Petter Selasky u8 status; 550adb6fd50SHans Petter 
Selasky int err; 551adb6fd50SHans Petter Selasky 552adb6fd50SHans Petter Selasky dev = container_of(work, struct mlx5_core_dev, priv.health.work_watchdog); 553adb6fd50SHans Petter Selasky 554adb6fd50SHans Petter Selasky if (!MLX5_CAP_GEN(dev, mcam_reg) || 555adb6fd50SHans Petter Selasky !MLX5_CAP_MCAM_FEATURE(dev, pcie_status_and_power)) 556adb6fd50SHans Petter Selasky return; 557adb6fd50SHans Petter Selasky 558adb6fd50SHans Petter Selasky err = mlx5_pci_read_power_status(dev, &power, &status); 559adb6fd50SHans Petter Selasky if (err < 0) { 560a2f4f59cSHans Petter Selasky mlx5_core_warn(dev, "Failed reading power status: %d\n", 561a2f4f59cSHans Petter Selasky err); 562adb6fd50SHans Petter Selasky return; 563adb6fd50SHans Petter Selasky } 564adb6fd50SHans Petter Selasky 565adb6fd50SHans Petter Selasky dev->pwr_value = power; 566adb6fd50SHans Petter Selasky 567adb6fd50SHans Petter Selasky if (dev->pwr_status != status) { 568adb6fd50SHans Petter Selasky 569adb6fd50SHans Petter Selasky switch (status) { 570adb6fd50SHans Petter Selasky case 0: 571adb6fd50SHans Petter Selasky dev->pwr_status = status; 572a2f4f59cSHans Petter Selasky mlx5_core_info(dev, 573a2f4f59cSHans Petter Selasky "PCI power is not published by the PCIe slot.\n"); 574adb6fd50SHans Petter Selasky break; 575adb6fd50SHans Petter Selasky case 1: 576adb6fd50SHans Petter Selasky dev->pwr_status = status; 577a2f4f59cSHans Petter Selasky mlx5_core_info(dev, 578a2f4f59cSHans Petter Selasky "PCIe slot advertised sufficient power (%uW).\n", 579a2f4f59cSHans Petter Selasky power); 580adb6fd50SHans Petter Selasky break; 581adb6fd50SHans Petter Selasky case 2: 582adb6fd50SHans Petter Selasky dev->pwr_status = status; 583a2f4f59cSHans Petter Selasky mlx5_core_warn(dev, 584a2f4f59cSHans Petter Selasky "Detected insufficient power on the PCIe slot (%uW).\n", 585a2f4f59cSHans Petter Selasky power); 586adb6fd50SHans Petter Selasky break; 587adb6fd50SHans Petter Selasky default: 588adb6fd50SHans Petter Selasky 
dev->pwr_status = 0; 589a2f4f59cSHans Petter Selasky mlx5_core_warn(dev, 590a2f4f59cSHans Petter Selasky "Unknown power state detected(%d).\n", 591a2f4f59cSHans Petter Selasky status); 592adb6fd50SHans Petter Selasky break; 593adb6fd50SHans Petter Selasky } 594adb6fd50SHans Petter Selasky } 595adb6fd50SHans Petter Selasky } 596adb6fd50SHans Petter Selasky 597adb6fd50SHans Petter Selasky void 598adb6fd50SHans Petter Selasky mlx5_trigger_health_watchdog(struct mlx5_core_dev *dev) 599adb6fd50SHans Petter Selasky { 600adb6fd50SHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 601adb6fd50SHans Petter Selasky unsigned long flags; 602adb6fd50SHans Petter Selasky 603adb6fd50SHans Petter Selasky spin_lock_irqsave(&health->wq_lock, flags); 604adb6fd50SHans Petter Selasky if (!test_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags)) 605adb6fd50SHans Petter Selasky queue_work(health->wq_watchdog, &health->work_watchdog); 606adb6fd50SHans Petter Selasky else 607a2f4f59cSHans Petter Selasky mlx5_core_err(dev, 608adb6fd50SHans Petter Selasky "scheduling watchdog is not permitted at this stage\n"); 609adb6fd50SHans Petter Selasky spin_unlock_irqrestore(&health->wq_lock, flags); 610adb6fd50SHans Petter Selasky } 611adb6fd50SHans Petter Selasky 61203ab395eSHans Petter Selasky static void poll_health(unsigned long data) 613dc7e38acSHans Petter Selasky { 614dc7e38acSHans Petter Selasky struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data; 615dc7e38acSHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 6161900b6f8SHans Petter Selasky u32 fatal_error; 617dc7e38acSHans Petter Selasky u32 count; 618dc7e38acSHans Petter Selasky 61930dfc051SHans Petter Selasky if (dev->state != MLX5_DEVICE_STATE_UP) 62030dfc051SHans Petter Selasky return; 62130dfc051SHans Petter Selasky 622dc7e38acSHans Petter Selasky count = ioread32be(health->health_counter); 623dc7e38acSHans Petter Selasky if (count == health->prev) 624dc7e38acSHans Petter Selasky 
++health->miss_counter; 625dc7e38acSHans Petter Selasky else 626dc7e38acSHans Petter Selasky health->miss_counter = 0; 627dc7e38acSHans Petter Selasky 628dc7e38acSHans Petter Selasky health->prev = count; 629dc7e38acSHans Petter Selasky if (health->miss_counter == MAX_MISSES) { 630a2485fe5SHans Petter Selasky mlx5_core_err(dev, "device's health compromised - reached miss count\n"); 631c9bb26aeSHans Petter Selasky if (print_health_info(dev) == 0) 632c9bb26aeSHans Petter Selasky mlx5_core_err(dev, "FW is unable to respond to initialization segment reads\n"); 633a2485fe5SHans Petter Selasky } 634a2485fe5SHans Petter Selasky 6351900b6f8SHans Petter Selasky fatal_error = check_fatal_sensors(dev); 6361900b6f8SHans Petter Selasky 6371900b6f8SHans Petter Selasky if (fatal_error && !health->fatal_error) { 638a2f4f59cSHans Petter Selasky mlx5_core_err(dev, 639a2f4f59cSHans Petter Selasky "Fatal error %u detected\n", fatal_error); 6401900b6f8SHans Petter Selasky dev->priv.health.fatal_error = fatal_error; 641a2485fe5SHans Petter Selasky print_health_info(dev); 6424bb7662bSHans Petter Selasky mlx5_trigger_health_work(dev); 643dc7e38acSHans Petter Selasky } 6444bb7662bSHans Petter Selasky 6454bb7662bSHans Petter Selasky mod_timer(&health->timer, get_next_poll_jiffies()); 646dc7e38acSHans Petter Selasky } 647dc7e38acSHans Petter Selasky 648dc7e38acSHans Petter Selasky void mlx5_start_health_poll(struct mlx5_core_dev *dev) 649dc7e38acSHans Petter Selasky { 650dc7e38acSHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 651dc7e38acSHans Petter Selasky 652dc7e38acSHans Petter Selasky init_timer(&health->timer); 6531900b6f8SHans Petter Selasky health->fatal_error = MLX5_SENSOR_NO_ERR; 654ca551594SHans Petter Selasky clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); 655519774eaSHans Petter Selasky clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); 656adb6fd50SHans Petter Selasky clear_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags); 657dc7e38acSHans 
Petter Selasky health->health = &dev->iseg->health; 658dc7e38acSHans Petter Selasky health->health_counter = &dev->iseg->health_counter; 659dc7e38acSHans Petter Selasky 66003ab395eSHans Petter Selasky setup_timer(&health->timer, poll_health, (unsigned long)dev); 661dc7e38acSHans Petter Selasky mod_timer(&health->timer, 662dc7e38acSHans Petter Selasky round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL)); 663adb6fd50SHans Petter Selasky 664adb6fd50SHans Petter Selasky /* do initial PCI power state readout */ 665adb6fd50SHans Petter Selasky mlx5_trigger_health_watchdog(dev); 666dc7e38acSHans Petter Selasky } 667dc7e38acSHans Petter Selasky 6682119f825SSlava Shwartsman void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health) 669dc7e38acSHans Petter Selasky { 670dc7e38acSHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 6712119f825SSlava Shwartsman unsigned long flags; 6722119f825SSlava Shwartsman 6732119f825SSlava Shwartsman if (disable_health) { 6742119f825SSlava Shwartsman spin_lock_irqsave(&health->wq_lock, flags); 6752119f825SSlava Shwartsman set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); 6762119f825SSlava Shwartsman set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); 677adb6fd50SHans Petter Selasky set_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags); 6782119f825SSlava Shwartsman spin_unlock_irqrestore(&health->wq_lock, flags); 6792119f825SSlava Shwartsman } 680dc7e38acSHans Petter Selasky 681dc7e38acSHans Petter Selasky del_timer_sync(&health->timer); 682dc7e38acSHans Petter Selasky } 683dc7e38acSHans Petter Selasky 684ca551594SHans Petter Selasky void mlx5_drain_health_wq(struct mlx5_core_dev *dev) 685ca551594SHans Petter Selasky { 686ca551594SHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 6874bb7662bSHans Petter Selasky unsigned long flags; 688ca551594SHans Petter Selasky 6894bb7662bSHans Petter Selasky spin_lock_irqsave(&health->wq_lock, flags); 690ca551594SHans Petter Selasky 
set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); 691519774eaSHans Petter Selasky set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); 692adb6fd50SHans Petter Selasky set_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags); 6934bb7662bSHans Petter Selasky spin_unlock_irqrestore(&health->wq_lock, flags); 6944bb7662bSHans Petter Selasky cancel_delayed_work_sync(&health->recover_work); 695ca551594SHans Petter Selasky cancel_work_sync(&health->work); 696adb6fd50SHans Petter Selasky cancel_work_sync(&health->work_watchdog); 697ca551594SHans Petter Selasky } 698ca551594SHans Petter Selasky 699519774eaSHans Petter Selasky void mlx5_drain_health_recovery(struct mlx5_core_dev *dev) 700519774eaSHans Petter Selasky { 701519774eaSHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 702519774eaSHans Petter Selasky unsigned long flags; 703519774eaSHans Petter Selasky 704519774eaSHans Petter Selasky spin_lock_irqsave(&health->wq_lock, flags); 705519774eaSHans Petter Selasky set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); 706519774eaSHans Petter Selasky spin_unlock_irqrestore(&health->wq_lock, flags); 707519774eaSHans Petter Selasky cancel_delayed_work_sync(&dev->priv.health.recover_work); 708519774eaSHans Petter Selasky } 709519774eaSHans Petter Selasky 710a2485fe5SHans Petter Selasky void mlx5_health_cleanup(struct mlx5_core_dev *dev) 711dc7e38acSHans Petter Selasky { 712a2485fe5SHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 713a2485fe5SHans Petter Selasky 714a2485fe5SHans Petter Selasky destroy_workqueue(health->wq); 71540218d73SHans Petter Selasky destroy_workqueue(health->wq_watchdog); 7168d1eeedbSHans Petter Selasky destroy_workqueue(health->wq_cmd); 717dc7e38acSHans Petter Selasky } 718dc7e38acSHans Petter Selasky 719a2485fe5SHans Petter Selasky int mlx5_health_init(struct mlx5_core_dev *dev) 720dc7e38acSHans Petter Selasky { 721a2485fe5SHans Petter Selasky struct mlx5_core_health *health; 72240218d73SHans Petter Selasky 
char name[64]; 723dc7e38acSHans Petter Selasky 724a2485fe5SHans Petter Selasky health = &dev->priv.health; 725a2485fe5SHans Petter Selasky 72640218d73SHans Petter Selasky snprintf(name, sizeof(name), "%s-rec", dev_name(&dev->pdev->dev)); 727a2485fe5SHans Petter Selasky health->wq = create_singlethread_workqueue(name); 728a2485fe5SHans Petter Selasky if (!health->wq) 7298d1eeedbSHans Petter Selasky goto err_recovery; 730a2485fe5SHans Petter Selasky 73140218d73SHans Petter Selasky snprintf(name, sizeof(name), "%s-wdg", dev_name(&dev->pdev->dev)); 73240218d73SHans Petter Selasky health->wq_watchdog = create_singlethread_workqueue(name); 7338d1eeedbSHans Petter Selasky if (!health->wq_watchdog) 7348d1eeedbSHans Petter Selasky goto err_watchdog; 7358d1eeedbSHans Petter Selasky 7368d1eeedbSHans Petter Selasky snprintf(name, sizeof(name), "%s-cmd", dev_name(&dev->pdev->dev)); 7378d1eeedbSHans Petter Selasky health->wq_cmd = create_singlethread_workqueue(name); 7388d1eeedbSHans Petter Selasky if (!health->wq_cmd) 7398d1eeedbSHans Petter Selasky goto err_cmd; 74040218d73SHans Petter Selasky 741ca551594SHans Petter Selasky spin_lock_init(&health->wq_lock); 742a2485fe5SHans Petter Selasky INIT_WORK(&health->work, health_care); 743adb6fd50SHans Petter Selasky INIT_WORK(&health->work_watchdog, health_watchdog); 744a0a4fd77SHans Petter Selasky INIT_WORK(&health->work_cmd_completion, mlx5_trigger_cmd_completions); 7454bb7662bSHans Petter Selasky INIT_DELAYED_WORK(&health->recover_work, health_recover); 746a2485fe5SHans Petter Selasky 747a2485fe5SHans Petter Selasky return 0; 7488d1eeedbSHans Petter Selasky 7498d1eeedbSHans Petter Selasky err_cmd: 7508d1eeedbSHans Petter Selasky destroy_workqueue(health->wq_watchdog); 7518d1eeedbSHans Petter Selasky err_watchdog: 7528d1eeedbSHans Petter Selasky destroy_workqueue(health->wq); 7538d1eeedbSHans Petter Selasky err_recovery: 7548d1eeedbSHans Petter Selasky return -ENOMEM; 755dc7e38acSHans Petter Selasky } 756