1 /*- 2 * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 * 25 * $FreeBSD$ 26 */ 27 28 #include <linux/kernel.h> 29 #include <linux/module.h> 30 #include <linux/random.h> 31 #include <linux/vmalloc.h> 32 #include <linux/hardirq.h> 33 #include <dev/mlx5/driver.h> 34 #include <dev/mlx5/mlx5_ifc.h> 35 #include "mlx5_core.h" 36 37 #define MLX5_HEALTH_POLL_INTERVAL (2 * HZ) 38 #define MAX_MISSES 3 39 40 enum { 41 MLX5_NIC_IFC_FULL = 0, 42 MLX5_NIC_IFC_DISABLED = 1, 43 MLX5_NIC_IFC_NO_DRAM_NIC = 2 44 }; 45 46 static u8 get_nic_interface(struct mlx5_core_dev *dev) 47 { 48 return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3; 49 } 50 51 static void mlx5_trigger_cmd_completions(struct mlx5_core_dev *dev) 52 { 53 unsigned long flags; 54 u64 vector; 55 56 /* wait for pending handlers to complete */ 57 synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector); 58 spin_lock_irqsave(&dev->cmd.alloc_lock, flags); 59 vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1); 60 if (!vector) 61 goto no_trig; 62 63 vector |= MLX5_TRIGGERED_CMD_COMP; 64 spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); 65 66 mlx5_core_dbg(dev, "vector 0x%lx\n", vector); 67 mlx5_cmd_comp_handler(dev, vector); 68 return; 69 70 no_trig: 71 spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); 72 } 73 74 static int in_fatal(struct mlx5_core_dev *dev) 75 { 76 struct mlx5_core_health *health = &dev->priv.health; 77 struct mlx5_health_buffer __iomem *h = health->health; 78 79 if (get_nic_interface(dev) == MLX5_NIC_IFC_DISABLED) 80 return 1; 81 82 if (ioread32be(&h->fw_ver) == 0xffffffff) 83 return 1; 84 85 return 0; 86 } 87 88 void mlx5_enter_error_state(struct mlx5_core_dev *dev) 89 { 90 mutex_lock(&dev->intf_state_mutex); 91 if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { 92 goto unlock; 93 return; 94 } 95 96 mlx5_core_err(dev, "start\n"); 97 if (pci_channel_offline(dev->pdev) || in_fatal(dev)) { 98 dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; 99 mlx5_trigger_cmd_completions(dev); 100 } 101 102 mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 0); 103 mlx5_core_err(dev, "end\n"); 104 105 unlock: 106 mutex_unlock(&dev->intf_state_mutex); 107 } 108 109 static void mlx5_handle_bad_state(struct mlx5_core_dev *dev) 110 { 111 u8 nic_interface = get_nic_interface(dev); 112 113 switch (nic_interface) { 114 case MLX5_NIC_IFC_FULL: 115 mlx5_core_warn(dev, "Expected to see disabled NIC but it is full driver\n"); 116 break; 117 118 case MLX5_NIC_IFC_DISABLED: 119 mlx5_core_warn(dev, "starting teardown\n"); 120 break; 121 122 case MLX5_NIC_IFC_NO_DRAM_NIC: 123 mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n"); 124 break; 125 default: 126 mlx5_core_warn(dev, "Expected to see disabled NIC but it is has invalid value %d\n", 127 nic_interface); 128 } 129 130 mlx5_disable_device(dev); 131 } 132 133 static void health_care(struct work_struct *work) 134 { 135 struct mlx5_core_health *health; 136 struct mlx5_core_dev *dev; 137 struct mlx5_priv *priv; 138 139 health = container_of(work, struct mlx5_core_health, work); 140 priv = container_of(health, struct mlx5_priv, health); 141 dev = container_of(priv, struct mlx5_core_dev, priv); 142 mlx5_core_warn(dev, "handling bad device here\n"); 143 mlx5_handle_bad_state(dev); 144 } 145 146 static int get_next_poll_jiffies(void) 147 { 148 unsigned long next; 149 150 get_random_bytes(&next, sizeof(next)); 151 next %= HZ; 152 next += jiffies + MLX5_HEALTH_POLL_INTERVAL; 153 154 return next; 155 } 156 157 static const char *hsynd_str(u8 synd) 158 { 159 switch (synd) { 160 case MLX5_HEALTH_SYNDR_FW_ERR: 161 return "firmware internal error"; 162 case MLX5_HEALTH_SYNDR_IRISC_ERR: 163 return "irisc not responding"; 164 case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR: 165 return "unrecoverable hardware error"; 166 case MLX5_HEALTH_SYNDR_CRC_ERR: 167 return "firmware CRC error"; 168 case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR: 169 return "ICM fetch PCI error"; 170 case MLX5_HEALTH_SYNDR_HW_FTL_ERR: 171 return "HW fatal error\n"; 172 case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR: 173 return "async EQ buffer overrun"; 174 case MLX5_HEALTH_SYNDR_EQ_ERR: 175 return "EQ error"; 176 case MLX5_HEALTH_SYNDR_EQ_INV: 177 return "Invalid EQ referenced"; 178 case MLX5_HEALTH_SYNDR_FFSER_ERR: 179 return "FFSER error"; 180 case MLX5_HEALTH_SYNDR_HIGH_TEMP: 181 return "High temprature"; 182 default: 183 return "unrecognized error"; 184 } 185 } 186 187 static void print_health_info(struct mlx5_core_dev *dev) 188 { 189 struct mlx5_core_health *health = &dev->priv.health; 190 struct mlx5_health_buffer __iomem *h = health->health; 191 char fw_str[18]; 192 u32 fw; 193 int i; 194 195 /* If the syndrom is 0, the device is OK and no need to print buffer */ 196 if (!ioread8(&h->synd)) 197 return; 198 199 for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) 200 printf("mlx5_core: INFO: ""assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i)); 201 202 printf("mlx5_core: INFO: ""assert_exit_ptr 0x%08x\n", ioread32be(&h->assert_exit_ptr)); 203 printf("mlx5_core: INFO: ""assert_callra 0x%08x\n", ioread32be(&h->assert_callra)); 204 snprintf(fw_str, sizeof(fw_str), "%d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev)); 205 printf("mlx5_core: INFO: ""fw_ver %s\n", fw_str); 206 printf("mlx5_core: INFO: ""hw_id 0x%08x\n", ioread32be(&h->hw_id)); 207 printf("mlx5_core: INFO: ""irisc_index %d\n", ioread8(&h->irisc_index)); 208 printf("mlx5_core: INFO: ""synd 0x%x: %s\n", ioread8(&h->synd), hsynd_str(ioread8(&h->synd))); 209 printf("mlx5_core: INFO: ""ext_synd 0x%04x\n", ioread16be(&h->ext_synd)); 210 fw = ioread32be(&h->fw_ver); 211 printf("mlx5_core: INFO: ""raw fw_ver 0x%08x\n", fw); 212 } 213 214 static void poll_health(unsigned long data) 215 { 216 struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data; 217 struct mlx5_core_health *health = &dev->priv.health; 218 u32 count; 219 220 if (dev->state != MLX5_DEVICE_STATE_UP) 221 return; 222 223 if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { 224 mod_timer(&health->timer, get_next_poll_jiffies()); 225 return; 226 } 227 228 count = ioread32be(health->health_counter); 229 if (count == health->prev) 230 ++health->miss_counter; 231 else 232 health->miss_counter = 0; 233 234 health->prev = count; 235 if (health->miss_counter == MAX_MISSES) { 236 mlx5_core_err(dev, "device's health compromised - reached miss count\n"); 237 print_health_info(dev); 238 } else { 239 mod_timer(&health->timer, get_next_poll_jiffies()); 240 } 241 242 if (in_fatal(dev) && !health->sick) { 243 health->sick = true; 244 print_health_info(dev); 245 queue_work(health->wq, &health->work); 246 } 247 } 248 249 void mlx5_start_health_poll(struct mlx5_core_dev *dev) 250 { 251 struct mlx5_core_health *health = &dev->priv.health; 252 253 init_timer(&health->timer); 254 health->sick = 0; 255 health->health = &dev->iseg->health; 256 health->health_counter = &dev->iseg->health_counter; 257 258 setup_timer(&health->timer, poll_health, (unsigned long)dev); 259 mod_timer(&health->timer, 260 round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL)); 261 } 262 263 void mlx5_stop_health_poll(struct mlx5_core_dev *dev) 264 { 265 struct mlx5_core_health *health = &dev->priv.health; 266 267 del_timer_sync(&health->timer); 268 } 269 270 void mlx5_health_cleanup(struct mlx5_core_dev *dev) 271 { 272 struct mlx5_core_health *health = &dev->priv.health; 273 274 destroy_workqueue(health->wq); 275 } 276 277 #define HEALTH_NAME "mlx5_health" 278 int mlx5_health_init(struct mlx5_core_dev *dev) 279 { 280 struct mlx5_core_health *health; 281 char *name; 282 int len; 283 284 health = &dev->priv.health; 285 len = strlen(HEALTH_NAME) + strlen(dev_name(&dev->pdev->dev)); 286 name = kmalloc(len + 1, GFP_KERNEL); 287 if (!name) 288 return -ENOMEM; 289 290 snprintf(name, len, "%s:%s", HEALTH_NAME, dev_name(&dev->pdev->dev)); 291 health->wq = create_singlethread_workqueue(name); 292 kfree(name); 293 if (!health->wq) 294 return -ENOMEM; 295 296 INIT_WORK(&health->work, health_care); 297 298 return 0; 299 } 300